In [1]:
from bertopic import BERTopic

# Load the BERTopic model that was saved to disk (the "except_ecolex" variant, per the file name).
MODEL_PATH = "/home/zhhuang/climate_policy_paper/code/model_save/bert_topic_except_ecolex_model"
topic_model = BERTopic.load(MODEL_PATH)
/home/zhhuang/anaconda3/envs/climatepolicy/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
In [2]:
import pandas as pd

# Spreadsheet holding one row per document: its text ("docs") and its year ("Year").
DOCS_XLSX = "/home/zhhuang/climate_policy_paper/code/data/Topic_docs_time_except_ecolex.xlsx"
df = pd.read_excel(DOCS_XLSX)

# Plain Python lists, kept in row order so docs[i] and timestamp[i] describe the same document.
docs = df["docs"].to_list()
timestamp = df["Year"].to_list()
In [3]:
# Per-document topic assignment for the whole corpus, restricted to the columns
# that are useful for inspection. (The former bare `get_topic_info()['Topic']`
# expression was dropped: its value was neither displayed nor stored.)
doc_info_columns = ["Topic", "Name", "Top_n_words", "Probability", "Representative_document"]
topic_model.get_document_info(docs)[doc_info_columns]
Out[3]:
Topic Name Top_n_words Probability Representative_document
0 -1 -1_energy_emission_development_support energy - emission - development - support - me... 0.767967 False
1 -1 -1_energy_emission_development_support energy - emission - development - support - me... 0.936575 False
2 -1 -1_energy_emission_development_support energy - emission - development - support - me... 0.382572 False
3 -1 -1_energy_emission_development_support energy - emission - development - support - me... 0.798235 False
4 -1 -1_energy_emission_development_support energy - emission - development - support - me... 0.356415 False
... ... ... ... ... ...
13585 3 3_vehicle_passenger_mobility_purchase vehicle - passenger - mobility - purchase - fr... 0.146053 False
13586 1 1_energy_appliance_vehicle_lamp energy - appliance - vehicle - lamp - househol... 0.067121 False
13587 1 1_energy_appliance_vehicle_lamp energy - appliance - vehicle - lamp - househol... 0.066461 False
13588 -1 -1_energy_emission_development_support energy - emission - development - support - me... 0.733766 False
13589 -1 -1_energy_emission_development_support energy - emission - development - support - me... 0.371204 False

13590 rows × 5 columns

In [4]:
from collections import Counter

# Frequency of every whitespace-separated token across all documents.
counts = Counter(word for doc in docs for word in doc.split())

# (word, count) pairs, most frequent first; ties keep first-seen order.
items = counts.most_common()

# Print the 100 most frequent words. Slicing (instead of indexing 0..99)
# keeps the cell working when the vocabulary has fewer than 100 entries.
for word, count in items[:100]:
    print("{0:<10}{1:>5}".format(word, count))
energy    25647
emission   5860
renewable  5738
efficiency 5610
development 5520
plan       5103
project    4917
electricity 4797
power      4432
system     4222
sector     4168
gas        4076
national   4020
building   3864
support    3836
policy     3601
measure    3558
fuel       3220
vehicle    3166
include    3109
reduce     3099
target     3064
government 3042
increase   2919
technology 2897
standard   2895
tax        2716
public     2603
production 2566
investment 2552
climate    2550
set        2489
programme  2488
environmental 2486
source     2466
reduction  2451
provide    2398
promote    2392
heat       2386
strategy   2326
consumption 2300
transport  2292
``         2265
aim        2246
program    2160
carbon     2157
requirement 2149
management 2133
industry   2121
establish  2075
sustainable 2066
resource   2001
action     1988
company    1935
develop    1925
solar      1908
supply     1874
market     1867
implementation 1847
plant      1844
capacity   1844
improve    1821
generation 1804
level      1780
cost       1778
country    1773
electric   1758
implement  1753
fund       1730
economic   1714
scheme     1711
objective  1677
activity   1623
product    1600
achieve    1556
change     1530
air        1514
service    1500
wind       1493
''         1480
goal       1455
efficient  1454
construction 1449
base       1448
ensure     1443
water      1441
build      1422
framework  1393
greenhouse 1390
price      1381
total      1374
environment 1372
green      1371
require    1353
oil        1327
natural    1312
equipment  1310
grant      1297
installation 1286
report     1285
In [5]:
# Ask the model for the 5 topics closest to the query and display the
# (word, score) pairs of the best match.
query = "Transport"
similar_topics, similarity = topic_model.find_topics(query, top_n=5)
closest_topic = similar_topics[0]
topic_model.get_topic(closest_topic)
Out[5]:
[('vehicle', 0.08605256916422592),
 ('passenger', 0.026346936494183622),
 ('mobility', 0.02305217605568082),
 ('purchase', 0.019652099518188953),
 ('freight', 0.014857606931569211),
 ('hybrid', 0.014231897748857608),
 ('battery', 0.013699978374273164),
 ('traffic', 0.013663290669214097),
 ('truck', 0.013651079201795197),
 ('railway', 0.013084886660973443)]
In [6]:
# Ask the model for the 5 topics closest to the query and display the
# (word, score) pairs of the best match.
query = "Industry"
similar_topics, similarity = topic_model.find_topics(query, top_n=5)
closest_topic = similar_topics[0]
topic_model.get_topic(closest_topic)
Out[6]:
[('excise', 0.037326874703956),
 ('diesel', 0.02948705047722744),
 ('petroleum', 0.02876827570041448),
 ('petrol', 0.020565038389434888),
 ('mw', 0.02022796339307252),
 ('taxation', 0.019770074703999285),
 ('hydropower', 0.019591516906273244),
 ('geothermal', 0.018945125318320953),
 ('gasoline', 0.018319656785637222),
 ('generation', 0.016250492549903513)]
In [7]:
# Ask the model for the 5 topics closest to the query and display the
# (word, score) pairs of the best match.
query = "Energy systems"
similar_topics, similarity = topic_model.find_topics(query, top_n=5)
closest_topic = similar_topics[0]
topic_model.get_topic(closest_topic)
Out[7]:
[('refrigeration', 0.21798700806101987),
 ('refrigerant', 0.13986385917107486),
 ('conditioning', 0.10668004911735993),
 ('partnership', 0.0570654327070873),
 ('preventative', 0.04393385580956857),
 ('recuperation', 0.04135448989597232),
 ('certification', 0.039874170027037185),
 ('dehumidifiers', 0.03898157629443045),
 ('regenerative', 0.03737406607839815),
 ('depleting', 0.03548307787665277)]
In [8]:
# Ask the model for the 5 topics closest to the query and display the
# (word, score) pairs of the best match.
query = "Buildings"
similar_topics, similarity = topic_model.find_topics(query, top_n=5)
closest_topic = similar_topics[0]
topic_model.get_topic(closest_topic)
Out[8]:
[('energy', 0.02458405283418319),
 ('appliance', 0.020051290170197984),
 ('vehicle', 0.012396776541827042),
 ('lamp', 0.012010329298167421),
 ('household', 0.011809807727682293),
 ('scheme', 0.010853864034571324),
 ('refrigerator', 0.010808802508147669),
 ('water', 0.009855005391131003),
 ('programme', 0.009434125596781176),
 ('equipment', 0.009407178494799322)]
In [9]:
# Ask the model for the 5 topics closest to the query and display the
# (word, score) pairs of the best match.
query = "Agriculture, Forestry and Other Land Use"
similar_topics, similarity = topic_model.find_topics(query, top_n=5)
closest_topic = similar_topics[0]
topic_model.get_topic(closest_topic)
Out[9]:
[('refrigeration', 0.21798700806101987),
 ('refrigerant', 0.13986385917107486),
 ('conditioning', 0.10668004911735993),
 ('partnership', 0.0570654327070873),
 ('preventative', 0.04393385580956857),
 ('recuperation', 0.04135448989597232),
 ('certification', 0.039874170027037185),
 ('dehumidifiers', 0.03898157629443045),
 ('regenerative', 0.03737406607839815),
 ('depleting', 0.03548307787665277)]
In [10]:
# Number of documents in the corpus list built from the "docs" column.
len(docs)
Out[10]:
13590
In [11]:
import os
# Directory that receives the exported figures.
images_path = "/home/zhhuang/climate_policy_paper/paper_images"
# exist_ok=True makes this a no-op when the directory is already there and
# avoids the race between a separate existence check and the creation call.
os.makedirs(images_path, exist_ok=True)
In [12]:
import plotly.io as pio
# Make SVG the default output format of the kaleido static-image exporter.
pio.kaleido.scope.default_format = "svg"
# pio.kaleido.scope.mathjax = "https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js"
In [13]:
# Bar charts of the 10 top words for the top 20 topics, one 300x300 panel per topic.
fig = topic_model.visualize_barchart(top_n_topics=20, n_words=10, width=300, height=300)

# Export into the shared figure directory (images_path, created in an earlier
# cell) rather than repeating its absolute path here.
pio.write_image(fig, os.path.join(images_path, "topic_except_ecolex_barchart.svg"))

# Last expression: render the interactive figure in the notebook.
fig
In [14]:
# topic_model.visualize_barchart(top_n_topics = 20, n_words=10,width = 300, height= 300)
In [15]:
# Heatmap produced by BERTopic's visualize_heatmap with its default settings.
fig2 = topic_model.visualize_heatmap()

# Export into the shared figure directory (images_path, created in an earlier
# cell) rather than repeating its absolute path here.
pio.write_image(fig2, os.path.join(images_path, "topic_except_ecolex_heatmap.svg"))

# Last expression: render the interactive figure in the notebook.
fig2
In [16]:
# Topic overview figure produced by BERTopic's visualize_topics with its default settings.
fig3 = topic_model.visualize_topics()

# Export into the shared figure directory (images_path, created in an earlier
# cell) rather than repeating its absolute path here.
pio.write_image(fig3, os.path.join(images_path, "topic_except_ecolex_visualize_topics.svg"))

# Last expression: render the interactive figure in the notebook.
fig3
In [17]:
import ast

# The hierarchy table is read back from an Excel file. The commented-out code
# below is what produced that file; uncomment it to regenerate the table.
# hierarchical_topics = topic_model.hierarchical_topics(docs)
# with pd.ExcelWriter("Topic_hierarchical_topics_except_ecolex.xlsx", engine='xlsxwriter',
#                     engine_kwargs={'options': {'strings_to_urls': False}}) as writer:
#     hierarchical_topics.to_excel(writer)
hierarchical_topics = pd.read_excel("/home/zhhuang/climate_policy_paper/code/Topic_hierarchical_topics_except_ecolex.xlsx")

# Excel stores list-valued cells as text, so a "Topics" column of topic-id
# lists comes back as strings such as "[0, 1]". Parse those strings back into
# lists so the table matches what hierarchical_topics() returned in memory.
# NOTE(review): assumes the sheet has a list-like "Topics" column - confirm.
if "Topics" in hierarchical_topics.columns:
    hierarchical_topics["Topics"] = hierarchical_topics["Topics"].apply(
        lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
    )

fig4 = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

# Export into the shared figure directory (images_path, created in an earlier
# cell) rather than repeating its absolute path here.
pio.write_image(fig4, os.path.join(images_path, "topic_except_ecolex_hierarchical_topics.svg"))

# Last expression: render the interactive figure in the notebook.
fig4
In [18]:
# Convert every year to a string for the "%Y" format used below. A year of 0 is
# replaced by 2020 (presumably a placeholder for a missing year - confirm).
# Comparing the string form catches the placeholder whether the "Year" column
# was read as integers or as text; the old `i == '0'` test only matched text.
timestamp = ["2020" if str(year) == "0" else str(year) for year in timestamp]

# Long-running step: the recorded progress output shows 20 iterations taking
# about 4.5 hours in total.
topics_over_time = topic_model.topics_over_time(docs, timestamp, datetime_format="%Y", nr_bins=20)

# Save to the same absolute path that the following cell reads, so the result
# does not depend on the notebook's current working directory.
with pd.ExcelWriter("/home/zhhuang/climate_policy_paper/code/Topic_except_ecolex_topics_over_time.xlsx",
                    engine='xlsxwriter',
                    engine_kwargs={'options': {'strings_to_urls': False}}) as writer:
    topics_over_time.to_excel(writer)
20it [4:26:28, 799.41s/it] 
In [19]:
# Reload the saved topics-over-time table from its Excel file.
topics_over_time = pd.read_excel("/home/zhhuang/climate_policy_paper/code/Topic_except_ecolex_topics_over_time.xlsx")
fig5 = topic_model.visualize_topics_over_time(topics_over_time)

# Export into the shared figure directory (images_path, created in an earlier
# cell) rather than repeating its absolute path here.
pio.write_image(fig5, os.path.join(images_path, "topic_except_ecolex_visualize_topics_over_time.svg"))

# Last expression: render the interactive figure in the notebook.
fig5
In [ ]: